library(vegan)
library(ggplot2)
library(cluster)
library(RColorBrewer)
library(ggalluvial)
library(reshape2)
library(ggsci)
library(randomcoloR)
require(cowplot)
library(ggpubr)
library(ggrepel)
library(gridExtra)
library(plotly)
CompositionTable <- function(x,n){ # change the metaphlan result to a composition table, select top n most abundant features
  
  require(foreach)
  x[is.na(x)]=0
  mean_value <- data.frame(Taxa=colnames(x), Mean_abundance=colSums(x)/nrow(x))
  most <- as.character(mean_value[order(mean_value$Mean_abundance,decreasing = T),]$Taxa[1:n])
  #print(paste("Most abundant taxa is",most,sep = " "))
  
  composition_table <- foreach(i=1:length(most),.combine = rbind) %do%  {
    return.string = data.frame(ID = rownames(x), Relative=x[,most[i]],Level=colnames(x[,most[i],drop=F]))
  }
  
  first <- composition_table[grep(most[1],composition_table$Level),]
  first <- first[order(first$Relative,decreasing = T),]
  level <- as.factor(first$ID)
  composition_table$ID <- factor(composition_table$ID,levels = level)
  
  return(composition_table)
}

AverageTable <- function(compositiontable){
  
  averageTable=matrix(nrow = 1,ncol = length(unique(compositiontable$Level)))
  rownames(averageTable)=deparse(substitute(compositiontable))
  colnames(averageTable)=c(as.character(unique(compositiontable$Level)))
  
  for(i in unique(compositiontable$Level)){
    subset=compositiontable$Relative[compositiontable$Level==i]
    avr=mean(subset[!is.na(subset)])
    averageTable[1,i]=avr
  }
  averageTable=as.data.frame(averageTable)
  averageTable$Others=1-sum(averageTable[!is.na(averageTable)])
  averageTable=as.data.frame(t(averageTable))
  averageTable$Taxa=rownames(averageTable)
  
  return(averageTable)
}

Load data and rename

apk=read.table("APK_metaphlan_merged.txt",sep = "\t",header = T,row.names = 1,
               stringsAsFactors = F,check.names = F)
fsk=read.table("FSK_metaphlan_merged.txt",sep = "\t",header = T,row.names = 1,
                stringsAsFactors = F,check.names = F)
apk=as.data.frame(t(apk/100))
fsk=as.data.frame(t(fsk/100))
rownames(apk)=gsub("_metaphlan","",rownames(apk))
rownames(fsk)=gsub("_metaphlan","",rownames(fsk))
coupling=read.table("APK_to_FSK_key.txt")
colnames(coupling)=c("APK","FSK")
coupling$Name=paste("Sample",seq(1,297),sep = "_")
apk=apk[rownames(apk) %in% coupling$APK,]
fsk=fsk[rownames(fsk) %in% coupling$FSK,]
apk=apk[order(rownames(apk)),]
coupling=coupling[order(coupling$APK),]
rownames(apk)=coupling$Name
fsk=fsk[order(rownames(fsk)),]
coupling=coupling[order(coupling$FSK),]
rownames(fsk)=coupling$Name

Keep genus and data filtering

# keep only genus
apk_genus=apk[,grep("g__",colnames(apk))]
apk_genus=apk_genus[,grep("s__",colnames(apk_genus),invert = T)]
colnames(apk_genus)=lapply(colnames(apk_genus),function(x){
  strsplit(x,"g__")[[1]][2]
})
fsk_genus=fsk[,grep("g__",colnames(fsk))]
fsk_genus=fsk_genus[,grep("s__",colnames(fsk_genus),invert = T)]
colnames(fsk_genus)=lapply(colnames(fsk_genus),function(x){
  strsplit(x,"g__")[[1]][2]
})

# remove low present bacteria and recaculate relative abundance 
cutoff=0.1
apk_genus = apk_genus[,colSums(apk_genus > 0) >= cutoff*nrow(apk_genus)]
for(i in 1:nrow(apk_genus)){
  apk_genus[i,]=apk_genus[i,]/sum(apk_genus[i,])
}
fsk_genus = fsk_genus[,colSums(fsk_genus > 0) >= cutoff*nrow(fsk_genus)]
for(i in 1:nrow(fsk_genus)){
  fsk_genus[i,]=fsk_genus[i,]/sum(fsk_genus[i,])
}

Composition compare between two methods

apk_table=CompositionTable(apk_genus,ncol(apk_genus))
fsk_table=CompositionTable(fsk_genus,ncol(fsk_genus))
apk_average=AverageTable(apk_table)
fsk_average=AverageTable(fsk_table)

intersect_list=intersect(rownames(fsk_average),rownames(apk_average))
intersect_list=intersect_list[intersect_list!="Others"]
apk_genus=apk_genus[order(rownames(apk_genus)),]
fsk_genus=fsk_genus[order(rownames(fsk_genus)),]
intersect=data.frame(Overlapp=intersect_list,FoldChange=NA,Pvalue=NA,FDR=NA)
for(i in 1:nrow(intersect)){
  bug=as.character(intersect$Overlapp[i])
  intersect$FoldChange[i]=apk_average[bug,]$apk_table/fsk_average[bug,]$fsk_table
  mm=wilcox.test(fsk_genus[,bug],apk_genus[,bug],paired = T)
  intersect$Pvalue[i]=mm$p.value
}
intersect$FClog=log2(intersect$FoldChange)
intersect$FDR=p.adjust(intersect$Pvalue)
intersect$threshold[intersect$FDR<0.05]="Significant"
intersect$threshold[intersect$FDR>0.05]="Non-Significant"

a <- list()
for (i in seq_len(nrow(intersect))) {
  m <- intersect[i, ]
  a[[i]] <- list(
    x = m[["FClog"]],
    y = -log10(m[["FDR"]]),
    text = m[["Overlapp"]],
    xref = "x",
    yref = "y",
    showarrow = F,
    ax = 20,
    ay = -40
  )
}
p <- plot_ly(data = intersect, x = ~FClog, y = ~(-log10(FDR)), 
             text = ~Overlapp, mode = "markers", color = ~threshold,
             marker=list(size=5),alpha=0.5,colors=c("#6F99ADFF","#BC3C29FF")) %>% 
  layout(title ="Volcano Plot") %>%
  layout(annotations = a)
p

Compare correlation among samples of Prevotella and Bacteroides

compare_prevotella=merge(apk_genus[,"Prevotella",drop=F],fsk_genus[,"Prevotella",drop=F],by="row.names",all=F)
colnames(compare_prevotella)=c("ID","apk","fsk")
compare_prevotella=na.omit(compare_prevotella)
cor.test(compare_prevotella$apk,compare_prevotella$fsk,method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  compare_prevotella$apk and compare_prevotella$fsk
## S = 708270, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8344662
p1=ggplot(compare_prevotella, aes(x=apk, y=fsk)) + 
  geom_point()+ggtitle("Correlation")+
  geom_smooth(method=lm)
p2=ggplot(data=compare_prevotella, aes(log(apk))) + 
  geom_histogram(fill="grey")+ggtitle("Prevotella in APK")
p3=ggplot(data=compare_prevotella, aes(log(fsk))) + 
  geom_histogram(fill="grey")+ggtitle("Prevotella in FSK")
  
compare_bacteroides=merge(apk_genus[,"Bacteroides",drop=F],fsk_genus[,"Bacteroides",drop=F],by="row.names",all=F)
colnames(compare_bacteroides)=c("ID","apk","fsk")
compare_bacteroides=na.omit(compare_bacteroides)
cor.test(compare_bacteroides$apk,compare_bacteroides$fsk,method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  compare_bacteroides$apk and compare_bacteroides$fsk
## S = 2062200, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5180287
p4=ggplot(compare_bacteroides, aes(x=apk, y=fsk)) + 
  geom_point()+ggtitle("Correlation")+
  geom_smooth(method=lm)
p5=ggplot(data=compare_bacteroides, aes(log(apk))) + 
  geom_histogram(fill="grey")+ggtitle("Bacteroides in APK")
p6=ggplot(data=compare_bacteroides, aes(log(fsk))) + 
  geom_histogram(fill="grey")+ggtitle("Bacteroides in FSK")
ggarrange(p2,p3,p1,p5,p6,p4,ncol = 3,nrow = 2)

Compare PCoA

# remove NA samples
apk_genus=na.omit(apk_genus)
fsk_genus=na.omit(fsk_genus)
fsk_genus=fsk_genus[rownames(fsk_genus) %in% rownames(apk_genus),]

beta_diversity=vegdist(apk_genus,method = "bray")
pcoa_analysis=as.data.frame(cmdscale(beta_diversity,k=4))
pcoa_analysis=merge(pcoa_analysis,apk_genus[,c("Bacteroides","Prevotella","Ruminococcus"),drop=F],by="row.names",all=F)
rownames(pcoa_analysis)=pcoa_analysis$Row.names
pcoa_analysis[,1]=NULL

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(200))
p1=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Bacteroides),size=2) + theme_bw()+sc+theme(legend.position = 'top')
p2=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Prevotella),size=2) + theme_bw()+sc+theme(legend.position = 'top')
p3=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Ruminococcus),size=2) + theme_bw()+sc+theme(legend.position = 'top')

beta_diversity=vegdist(fsk_genus,method = "bray")
pcoa_analysis=as.data.frame(cmdscale(beta_diversity,k=4))
pcoa_analysis=merge(pcoa_analysis,fsk_genus[,c("Bacteroides","Prevotella","Ruminococcus"),drop=F],by="row.names",all=F)
rownames(pcoa_analysis)=pcoa_analysis$Row.names
pcoa_analysis[,1]=NULL

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(200))
p4=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Bacteroides),size=2) + theme_bw()+sc+theme(legend.position = 'top')
p5=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Prevotella),size=2) + theme_bw()+sc+theme(legend.position = 'top')
p6=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Ruminococcus),size=2) + theme_bw()+sc+theme(legend.position = 'top')
# APK vs FSK, colored by Bacteroides
ggarrange(p1,p4,ncol = 2)

# APK vs FSK, colored by Prevotella
ggarrange(p2,p5,ncol = 2)

# APK vs FSK, colored by Ruminococcus
ggarrange(p3,p6,ncol = 2)

Remove Prevotella from FSK samples

fsk_genus_noP=fsk_genus[,colnames(fsk_genus)!="Prevotella"]
beta_diversity=vegdist(fsk_genus_noP,method = "bray")
pcoa_analysis=as.data.frame(cmdscale(beta_diversity,k=4))
pcoa_analysis=merge(pcoa_analysis,fsk_genus_noP[,c("Bacteroides","Ruminococcus"),drop=F],by="row.names",all=F)
rownames(pcoa_analysis)=pcoa_analysis$Row.names
pcoa_analysis[,1]=NULL

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(200))
p1=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Bacteroides),size=2) + theme_bw()+sc+theme(legend.position = 'top')
p3=ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Ruminococcus),size=2) + theme_bw()+sc+theme(legend.position = 'top')
ggarrange(p1,p3,ncol = 2)

LLD sample

# LLD 920 samples
lld_taxa=read.table("LLD_taxonomy_metaphlan2_092017.txt",sep = "\t",header = T,check.names = F,
                    stringsAsFactors = F,row.names = 1)
lld_id_change=read.table("eqtl_LLD_linkage.txt",sep = "\t")
lld_taxa=lld_taxa[,colnames(lld_taxa) %in% lld_id_change$V2]
colnames(lld_taxa)=lld_id_change$V1
lld_taxa=as.data.frame(t(lld_taxa))

# keep only genus and data filtering
genus=lld_taxa[,grep("g__",colnames(lld_taxa))]
genus=genus[,grep("s__",colnames(genus),invert = T)]
colnames(genus)=lapply(colnames(genus),function(x){
  strsplit(x,"g__")[[1]][2]
})
cutoff=0.1
genus = genus[,colSums(genus > 0) >= cutoff*nrow(genus)]
for(i in 1:nrow(genus)){
  genus[i,]=genus[i,]/sum(genus[i,])
}
genus[is.na(genus)]=0
beta_diversity=vegdist(genus,method = "bray")
## Warning in vegdist(genus, method = "bray"): you have empty rows: their
## dissimilarities may be meaningless in method "bray"
pcoa_analysis=as.data.frame(cmdscale(beta_diversity,k=4))
pcoa_analysis=merge(pcoa_analysis,genus[,c("Bacteroides","Prevotella","Ruminococcus"),drop=F],by="row.names",all=F)
rownames(pcoa_analysis)=pcoa_analysis$Row.names
pcoa_analysis[,1]=NULL

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(200))
ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Bacteroides),size=2) + theme_bw()+sc

ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Prevotella),size=2) + theme_bw()+sc

ggplot (pcoa_analysis, aes(V1,V2)) + geom_point(aes(colour =Ruminococcus),size=2) + theme_bw()+sc